{
    "cells": [
        {
            "cell_type": "markdown",
            "id": "30146ad2-f165-4f4b-ae07-fe6597a2964f",
            "metadata": {},
            "source": [
                "# Web Page Reader\n",
                "\n",
                "Demonstrates our web page readers: `SimpleWebPageReader`, `TrafilaturaWebReader`, and `RssReader`."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "3c39063b",
            "metadata": {},
            "outputs": [],
            "source": [
                "import logging\n",
                "import sys\n",
                "\n",
                "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
                "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "2315a154-f72d-4447-b1eb-cde9b66868cb",
            "metadata": {},
            "source": [
                "#### Using SimpleWebPageReader"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "87bf7ecd-50cd-47da-9f0e-bc48d7ae45d8",
            "metadata": {},
            "outputs": [],
            "source": [
                "from llama_index import SummaryIndex, SimpleWebPageReader\n",
                "from IPython.display import Markdown, display\n",
                "import os"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "b6de3929-51eb-4064-b4b6-c203bb6debc4",
            "metadata": {},
            "outputs": [],
            "source": [
                "# NOTE: the html_to_text=True option requires the html2text package to be installed (pip install html2text)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "663403de-2e6e-4340-ab8f-8ee681bc06aa",
            "metadata": {},
            "outputs": [],
            "source": [
                "documents = SimpleWebPageReader(html_to_text=True).load_data(\n",
                "    [\"http://paulgraham.com/worked.html\"]\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "b8cd183a-2423-4a3e-ad92-dfe89ed5454e",
            "metadata": {},
            "outputs": [],
            "source": [
                "documents[0]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "26854cc3-af61-4910-ab6b-3bed6acfb447",
            "metadata": {},
            "outputs": [],
            "source": [
                "index = SummaryIndex.from_documents(documents)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "5cfdf87a-97cb-481f-ad51-be5bf8b5217f",
            "metadata": {},
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine()\n",
                "response = query_engine.query(\"What did the author do growing up?\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "7278d033-cae3-4ddf-96bd-75ea570ca53f",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "2708dc99-0e4d-4c7e-b180-8392286d87c2",
            "metadata": {},
            "source": [
                "#### Using TrafilaturaWebReader"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "aa2d54c6-c694-4852-a743-165e4777bd56",
            "metadata": {},
            "outputs": [],
            "source": [
                "from llama_index import TrafilaturaWebReader"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "46854f2f-426e-40a3-a87f-5fb51f90e14c",
            "metadata": {},
            "outputs": [],
            "source": [
                "documents = TrafilaturaWebReader().load_data([\"http://paulgraham.com/worked.html\"])"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "80752ad3-1ed8-4695-9247-22efbe475746",
            "metadata": {},
            "outputs": [],
            "source": [
                "index = SummaryIndex.from_documents(documents)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "8cc9b154-1dcf-479b-b49b-251874aea506",
            "metadata": {},
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine()\n",
                "response = query_engine.query(\"What did the author do growing up?\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "971b6415-8bcd-4d8b-a1de-9b7ada3cd392",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "b2b6d07c",
            "metadata": {},
            "source": [
                "### Using RssReader"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "a5ad5ca8",
            "metadata": {},
            "outputs": [],
            "source": [
                "from llama_index import SummaryIndex, RssReader\n",
                "\n",
                "documents = RssReader().load_data(\n",
                "    [\"https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml\"]\n",
                ")\n",
                "\n",
                "index = SummaryIndex.from_documents(documents)\n",
                "\n",
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine()\n",
                "response = query_engine.query(\"What happened in the news today?\")"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3 (ipykernel)",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.11.1"
        },
        "vscode": {
            "interpreter": {
                "hash": "c32397a35d2e76e766f80c3872b208f0c0029e8a6a9b8e2a8fe7b1641cfa009b"
            }
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}
